"""
先知爬虫
"""
from library import library as clib
from crawler import baseScrawler
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Module-level shared connection helper.
connect = clib.Connect()


class xianzhi(baseScrawler.baseScrawler):
    """Crawler for the XianZhi community (https://xz.aliyun.com).

    Walks the article index newest-first, collecting one dict per article
    into ``self.final_all`` until an article dated on/before ``start_date``
    is reached, then ranks the results with the inherited filter.
    """

    def __init__(self, start_date, num):
        """
        Args:
            start_date: datetime lower bound; crawling stops at the first
                article whose date is <= this value.
            num: number of top results the view-count filter should keep.
        """
        super().__init__(start_date, num)
        self.mainUrl = "https://xz.aliyun.com"
        # Marker strings used to verify a page loaded correctly; they are
        # matched against page content and must stay in Chinese.
        self.flag = '年度贡献榜'
        self.flag_page_view = '浏览数'

    def html_parse(self, html):
        """Parse one index page and append article dicts to self.final_all.

        Sets ``self.mark = 1`` and stops as soon as an article dated on or
        before ``self.start_date`` is encountered (the index is assumed to
        be ordered newest-first).
        """
        soup = BeautifulSoup(html, "html.parser")
        article_list = soup.find_all("tr")

        for item in article_list:
            paragraphs = item.find_all("p")
            if len(paragraphs) < 2:
                # Skip header/malformed rows that lack the two <p> cells
                # (previously these raised IndexError).
                continue
            p1, p2 = paragraphs[0], paragraphs[1]

            dic = {}
            article_link = self.mainUrl + p1.a.attrs['href']
            article_title = p1.a.string.lstrip()
            author_link = self.mainUrl + p2.find_all("a")[0].attrs['href']
            author_name = p2.find_all("a")[0].string
            classification = p2.find_all("a")[1].string
            # NOTE(review): positional parse of the date cell — fragile
            # against any markup change; verify if the site layout shifts.
            time = p2.text.split("\n")[6][7:]
            html_tmp = connect.siteConnect(article_link, self.flag_page_view)
            soup_tmp = BeautifulSoup(html_tmp, "html.parser")
            # Raw string: '\d' in a plain literal is an invalid escape
            # (SyntaxWarning on Python 3.12+). [4:] strips the
            # "浏览数 " prefix (3 chars + space) from the matched text.
            page_view = int(soup_tmp.find_all(text=re.compile(self.flag_page_view + r' \d+'))[0][4:])

            dic['article_link'] = article_link
            dic['article_title'] = article_title
            dic['author_link'] = author_link
            dic['author_name'] = author_name
            dic['classification'] = classification
            dic['time'] = time
            dic['page_view'] = page_view

            time = datetime.strptime(dic['time'], "%Y-%m-%d")
            if time <= self.start_date:
                # Reached the date cutoff; signal the outer loop to stop.
                self.mark = 1
                break
            elif self.unique(dic):
                self.final_all.append(dic)
                self.allurl.append(dic.get("author_link"))

    def start(self):
        """Drive the crawl page by page until the date cutoff is reached.

        Stores the filtered result list in ``self.res``; pages that fail
        to load are reported and skipped.
        """
        page = 0

        while self.mark == 0:
            page = page + 1
            currentUrl = self.mainUrl + "/?page=" + str(page)
            ures = connect.siteConnect(currentUrl, self.flag)

            if ures.find('4ny0neSec_wrong') == -1:
                self.html_parse(ures)
            else:
                # TODO: failed URLs could be written to a log file instead.
                print("-----------------" + currentUrl + " fail-----------------")
        self.res = self.filter.views_first(self.final_all, self.num)


